Load required packages
Load the dataset
# Location of the Excel workbook that holds the descriptor table.
file_path <- "C:/Users/dongl/OneDrive/Desktop/Predict_melting_point/Predict_melting_point.xlsx - Descriptors - Copy.xlsx"
# Import the workbook with readxl, leaving every other read option at its default.
descriptor_data <- readxl::read_excel(path = file_path)
Data cleaning: rename the `Ave °C` column to `target_var`, remove rows with missing values, handle duplicate rows, remove columns with more than 95% zero values, and store the cleaned data in `descriptor_data_cleaned`.
## Number of rows removed due to missing values: 5
## Number of duplicate rows found: 0
## tibble [2,701 × 151] (S3: tbl_df/tbl/data.frame)
## $ target_var : num [1:2701] -161.5 -117 -23 -80.8 -73 ...
## $ SMILES : chr [1:2701] "C1(CCC1)C" "CN(C)C" "ClC(Cl)(Cl)Cl" "C#C" ...
## $ MaxAbsEStateIndex : num [1:2701] 2.31 2 4.83 4 5.31 2.21 3.83 3.8 5.47 5.4 ...
## $ MaxEStateIndex : num [1:2701] 2.31 2 4.83 4 5.31 2.21 3.83 3.8 5.47 5.4 ...
## $ MinAbsEStateIndex : num [1:2701] 1.06 2 1.61 4 0.2 1.27 0.31 0.78 0.86 0.89 ...
## $ MinEStateIndex : num [1:2701] 1.06 2 -1.61 4 -1.72 1.27 0.31 0.78 0.86 0.89 ...
## $ qed : num [1:2701] 0.41 0.38 0.47 0.33 0.56 0.48 0.41 0.47 0.49 0.43 ...
## $ SPS : num [1:2701] 25.8 9 12 1 11.3 ...
## $ MolWt : num [1:2701] 70.1 59.1 153.8 26 248.8 ...
## $ HeavyAtomMolWt : num [1:2701] 60.1 50 153.8 24 248.8 ...
## $ ExactMolWt : num [1:2701] 70.1 59.1 151.9 26 245.8 ...
## $ NumValenceElectrons : num [1:2701] 30 26 32 10 54 48 42 42 32 30 ...
## $ MaxPartialCharge : num [1:2701] -0.04 -0.01 0.27 -0.12 0.23 -0.03 -0.02 -0.03 0.09 0.03 ...
## $ MinPartialCharge : num [1:2701] -0.06 -0.31 -0.07 -0.12 -0.08 -0.08 -0.1 -0.1 -0.13 -0.13 ...
## $ MaxAbsPartialCharge : num [1:2701] 0.06 0.31 0.27 0.12 0.23 0.08 0.1 0.1 0.13 0.13 ...
## $ MinAbsPartialCharge : num [1:2701] 0.04 0.01 0.07 0.12 0.08 0.03 0.02 0.03 0.09 0.03 ...
## $ FpDensityMorgan1 : num [1:2701] 1.4 1 0.8 1 0.89 1.13 1.29 1.57 1.67 1.6 ...
## $ FpDensityMorgan2 : num [1:2701] 1.8 1 0.8 1 1.22 1.63 1.43 2 2.5 2.2 ...
## $ FpDensityMorgan3 : num [1:2701] 1.8 1 0.8 1 1.22 1.75 1.43 2 2.67 2.2 ...
## $ BCUT2D_MWHI : num [1:2701] 14.2 15 35.6 12.6 35.6 ...
## $ BCUT2D_MWLOW : num [1:2701] 9.88 11.01 11.84 11.43 10.71 ...
## $ BCUT2D_CHGHI : num [1:2701] 2.09 1.58 2.11 0.45 2.26 1.87 2.08 1.9 1.77 2.18 ...
## $ BCUT2D_CHGLO : num [1:2701] -2.18 -1.9 -1.91 -0.7 -2.07 -1.98 -2.16 -1.99 -1.62 -1.7 ...
## $ BCUT2D_LOGPHI : num [1:2701] 2.24 1.47 2.29 0.58 2.4 2.07 2.21 2.05 2.13 2.29 ...
## $ BCUT2D_LOGPLOW : num [1:2701] -2.04 -1.99 -1.81 -0.58 -2.01 -1.77 -2.04 -1.84 -1.29 -1.67 ...
## $ BCUT2D_MRHI : num [1:2701] 4.62 4.09 6.83 4.47 6.75 5.05 4.99 4.88 7.14 6.18 ...
## $ BCUT2D_MRLOW : num [1:2701] 0.35 0.51 1.76 3.31 1.42 0.89 0.51 0.64 2.02 0.99 ...
## $ AvgIpc : num [1:2701] 1.3 0.81 0.72 1 1.62 1.67 1.31 1.27 1.86 2.12 ...
## $ BalabanJ : num [1:2701] 2.08 2.32 3.02 3 4.4 3.62 3.79 3.13 3.05 2 ...
## $ BertzCT : num [1:2701] 27.02 8 19.12 4.75 125.73 ...
## $ Chi0 : num [1:2701] 3.7 3.58 4.5 2 7.65 6.57 6.08 5.86 4.41 3.7 ...
## $ Chi0n : num [1:2701] 3.7 3.45 2.01 1.15 3.77 6.41 5.71 5.49 3.02 3.08 ...
## $ Chi0v : num [1:2701] 3.7 3.45 5.04 1.15 8.3 6.41 5.71 5.49 4.59 3.83 ...
## $ Chi1 : num [1:2701] 2.39 1.73 2 1 3.85 3.68 2.94 3.13 2.89 2.43 ...
## $ Chi1n : num [1:2701] 2.39 1.34 0.76 0.33 1.63 3.31 2.6 2.77 1.58 1.99 ...
## $ Chi1v : num [1:2701] 2.39 1.34 2.27 0.33 3.9 3.31 2.6 2.77 2.84 2.53 ...
## $ Chi2n : num [1:2701] 2.04 1.34 0.43 0 1.07 2.53 3.03 2.56 0.92 1.6 ...
## $ Chi2v : num [1:2701] 2.04 1.34 3.86 0 4.68 2.53 3.03 2.56 2.5 1.91 ...
## $ Chi3n : num [1:2701] 1.39 0 0 0 0.41 1.48 1.28 0.76 0.49 0.92 ...
## $ Chi3v : num [1:2701] 1.39 0 0 0 2.32 1.48 1.28 0.76 1.53 1.35 ...
## $ Chi4n : num [1:2701] 0.61 0 0 0 0.11 0.63 0 0.7 0.26 0.27 ...
## $ Chi4v : num [1:2701] 0.61 0 0 0 0.96 0.63 0 0.7 0.93 0.58 ...
## $ HallKierAlpha : num [1:2701] 0 -0.04 1.16 -0.44 1.48 -0.26 -0.26 -0.26 -0.01 0.29 ...
## $ Ipc : num [1:2701] 10.39 3.25 3.61 2 50.07 ...
## $ Kappa1 : num [1:2701] 3.2 3.96 6.16 1.56 10.48 ...
## $ Kappa2 : num [1:2701] 1 1.3 1.74 0.56 3.75 3.71 1.69 2.84 1.63 1.17 ...
## $ Kappa3 : num [1:2701] 0.44 1128.96 38.29 -4.71 4.02 ...
## $ LabuteASA : num [1:2701] 33.2 27.2 50 14.1 82.6 ...
## $ PEOE_VSA1 : num [1:2701] 0 4.9 0 0 0 0 0 0 0 0 ...
## $ PEOE_VSA10 : num [1:2701] 0 0 0 0 4.49 0 0 0 0 0 ...
## $ PEOE_VSA11 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ PEOE_VSA12 : num [1:2701] 0 0 0 0 3.79 0 0 0 0 0 ...
## $ PEOE_VSA13 : num [1:2701] 0 0 3.25 0 0 0 0 0 0 0 ...
## $ PEOE_VSA14 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ PEOE_VSA2 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ PEOE_VSA3 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ PEOE_VSA4 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ PEOE_VSA5 : num [1:2701] 0 0 0 12.8 0 ...
## $ PEOE_VSA6 : num [1:2701] 26.2 0 46.4 0 69.6 ...
## $ PEOE_VSA7 : num [1:2701] 5.92 21.14 0 0 0 ...
## $ PEOE_VSA8 : num [1:2701] 0 0 0 0 0 0 0 0 0 5.88 ...
## $ PEOE_VSA9 : num [1:2701] 0 0 0 0 5.03 0 0 0 4.34 0 ...
## $ SMR_VSA1 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ SMR_VSA10 : num [1:2701] 0 0 46.4 0 69.6 ...
## $ SMR_VSA3 : num [1:2701] 0 4.9 0 0 0 0 0 0 0 0 ...
## $ SMR_VSA4 : num [1:2701] 5.92 0 0 0 0 0 5.41 5.92 0 5.92 ...
## $ SMR_VSA5 : num [1:2701] 26.19 0 3.25 0 3.79 ...
## $ SMR_VSA6 : num [1:2701] 0 21.1 0 0 0 ...
## $ SMR_VSA7 : num [1:2701] 0 0 0 0 9.52 ...
## $ SMR_VSA9 : num [1:2701] 0 0 0 12.8 0 ...
## $ SlogP_VSA1 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ SlogP_VSA10 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ SlogP_VSA11 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ SlogP_VSA12 : num [1:2701] 0 0 46.4 0 69.6 ...
## $ SlogP_VSA2 : num [1:2701] 0 26.04 3.25 0 3.79 ...
## $ SlogP_VSA3 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ SlogP_VSA4 : num [1:2701] 5.92 0 0 12.85 0 ...
## $ SlogP_VSA5 : num [1:2701] 26.2 0 0 0 0 ...
## $ SlogP_VSA6 : num [1:2701] 0 0 0 0 9.52 ...
## $ SlogP_VSA7 : num [1:2701] 0 0 0 0 0 0 0 0 4.34 0 ...
## $ SlogP_VSA8 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ TPSA : num [1:2701] 0 3.24 0 0 0 0 0 0 0 0 ...
## $ EState_VSA1 : num [1:2701] 0 0 3.25 0 3.79 0 0 0 0 0 ...
## $ EState_VSA10 : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
## $ EState_VSA2 : num [1:2701] 0 0 0 0 9.52 0 0 0 0 0 ...
## $ EState_VSA3 : num [1:2701] 0 0 0 0 0 0 5.41 0 0 0 ...
## $ EState_VSA4 : num [1:2701] 5.92 0 0 0 0 0 0 5.92 4.34 11.8 ...
## $ EState_VSA5 : num [1:2701] 19.3 0 0 0 0 ...
## $ EState_VSA6 : num [1:2701] 0 0 0 0 0 ...
## $ EState_VSA7 : num [1:2701] 0 26 0 0 0 ...
## $ EState_VSA8 : num [1:2701] 6.92 0 0 12.85 0 ...
## $ EState_VSA9 : num [1:2701] 0 0 46.4 0 69.6 ...
## $ VSA_EState1 : num [1:2701] 0 0 -1.61 0 -1.96 0 0 0 0.86 0 ...
## $ VSA_EState10 : num [1:2701] 0 0 19.3 0 31.4 ...
## $ VSA_EState2 : num [1:2701] 0 2 0 0 0 0 0 0 0 0 ...
## $ VSA_EState3 : num [1:2701] 0 0 0 0 -0.2 0 0 0 1.95 0 ...
## $ VSA_EState4 : num [1:2701] 0 0 0 0 0 3.03 1.55 1.29 0 0 ...
## $ VSA_EState5 : num [1:2701] 1.06 0 0 0 0 0 0 0.78 0 1.79 ...
## $ VSA_EState6 : num [1:2701] 0 0 0 0 0 0 0 0 3.79 0 ...
## [list output truncated]
## - attr(*, "na.action")= 'omit' Named int [1:5] 277 337 686 1134 2608
## ..- attr(*, "names")= chr [1:5] "277" "337" "686" "1134" ...
# Calculate column statistics for all numeric columns except 'target_var'.
# The exclusion list previously named 'D1', which does not match any column,
# so the response variable itself ended up in the statistics table.
columns_to_analyze <- setdiff(names(descriptor_data_cleaned), "target_var")

# Keep numeric columns only: text columns such as SMILES have no meaningful
# min/mean/sd and would otherwise show up as a row of placeholder zeros.
is_numeric_column <- vapply(
  columns_to_analyze,
  function(col) is.numeric(descriptor_data_cleaned[[col]]),
  logical(1)
)
columns_to_analyze <- columns_to_analyze[is_numeric_column]

# Apply a scalar summary function to every analysed column and return the
# results as a plain numeric vector (one value per column, in column order).
column_stat <- function(stat_fun) {
  vapply(
    columns_to_analyze,
    function(col) stat_fun(descriptor_data_cleaned[[col]]),
    numeric(1),
    USE.NAMES = FALSE
  )
}

column_min <- column_stat(function(values) min(values, na.rm = TRUE))
column_max <- column_stat(function(values) max(values, na.rm = TRUE))

# One row per descriptor; Range is derived from the Min/Max already computed.
column_stats <- data.frame(
  Column = columns_to_analyze,
  Count = column_stat(length),
  Distinct = column_stat(function(values) length(unique(values))),
  Min = column_min,
  Max = column_max,
  Average = column_stat(function(values) mean(values, na.rm = TRUE)),
  Std_Dev = column_stat(function(values) sd(values, na.rm = TRUE)),
  Range = column_max - column_min,
  # na.rm keeps a stray NA from turning the whole zero count into NA
  Zero_Values = column_stat(function(values) sum(values == 0, na.rm = TRUE)),
  stringsAsFactors = FALSE
)
print(column_stats)
## Column Count Distinct Min Max Average Std_Dev Range Zero_Values
## 1 target_var 2701 1902 -187.75 4.376500e+02 6.014403e+01 9.346898e+01 6.254000e+02 1
## 2 SMILES 0 0 0.00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0
## 3 MaxAbsEStateIndex 2701 794 0.00 1.382000e+01 8.163965e+00 3.356041e+00 1.382000e+01 1
## 4 MaxEStateIndex 2701 794 0.00 1.382000e+01 8.163961e+00 3.356047e+00 1.382000e+01 1
## 5 MinAbsEStateIndex 2701 192 0.00 7.810000e+00 5.138282e-01 5.218991e-01 7.810000e+00 28
## 6 MinEStateIndex 2701 469 -8.73 4.000000e+00 -1.501814e-01 1.270644e+00 1.273000e+01 9
## 7 qed 2701 85 0.03 9.100000e-01 5.504258e-01 1.344203e-01 8.800000e-01 0
## 8 SPS 2701 537 0.00 5.246000e+01 1.089180e+01 4.084920e+00 5.246000e+01 1
## 9 MolWt 2701 1385 16.04 9.591700e+02 1.760413e+02 7.577216e+01 9.431300e+02 0
## 10 HeavyAtomMolWt 2701 915 12.01 9.591700e+02 1.654050e+02 7.261552e+01 9.471600e+02 0
## 11 ExactMolWt 2701 1411 16.03 9.491800e+02 1.756890e+02 7.552626e+01 9.331500e+02 0
## 12 NumValenceElectrons 2701 93 8.00 3.380000e+02 6.203036e+01 2.663677e+01 3.300000e+02 0
## 13 MaxPartialCharge 2701 69 -0.12 9.500000e-01 1.735024e-01 1.367367e-01 1.070000e+00 72
## 14 MinPartialCharge 2701 52 -0.63 -1.000000e-02 -3.288856e-01 1.514091e-01 6.200000e-01 0
## 15 MaxAbsPartialCharge 2701 62 0.03 9.500000e-01 3.354572e-01 1.511506e-01 9.200000e-01 0
## 16 MinAbsPartialCharge 2701 49 0.00 4.900000e-01 1.737579e-01 1.196618e-01 4.900000e-01 72
## 17 FpDensityMorgan1 2701 146 0.11 2.000000e+00 1.138371e+00 3.719377e-01 1.890000e+00 0
## 18 FpDensityMorgan2 2701 186 0.18 2.670000e+00 1.655046e+00 4.731753e-01 2.490000e+00 0
## 19 FpDensityMorgan3 2701 221 0.25 3.220000e+00 2.034546e+00 5.595734e-01 2.970000e+00 0
## 20 BCUT2D_MWHI 2701 272 12.01 1.269300e+02 2.714047e+01 2.253253e+01 1.149200e+02 0
## 21 BCUT2D_MWLOW 2701 184 9.49 1.201000e+01 1.025002e+01 3.151043e-01 2.520000e+00 0
## 22 BCUT2D_CHGHI 2701 147 -0.08 2.950000e+00 2.003436e+00 2.048932e-01 3.030000e+00 0
## 23 BCUT2D_CHGLO 2701 129 -2.56 -8.000000e-02 -1.978571e+00 1.697027e-01 2.480000e+00 0
## 24 BCUT2D_LOGPHI 2701 145 0.14 2.630000e+00 2.107593e+00 2.119771e-01 2.490000e+00 0
## 25 BCUT2D_LOGPLOW 2701 156 -2.88 1.400000e-01 -1.945539e+00 2.367239e-01 3.020000e+00 0
## 26 BCUT2D_MRHI 2701 361 2.50 1.428000e+01 6.256309e+00 1.652226e+00 1.178000e+01 0
## 27 BCUT2D_MRLOW 2701 258 -1.11 4.670000e+00 4.202555e-01 6.320404e-01 5.780000e+00 4
## 28 AvgIpc 2701 173 0.00 3.580000e+00 1.986246e+00 3.938985e-01 3.580000e+00 1
## 29 BalabanJ 2701 270 0.00 6.610000e+00 2.827016e+00 5.131153e-01 6.610000e+00 1
## 30 BertzCT 2701 1693 0.00 1.935570e+03 2.657365e+02 2.037001e+02 1.935570e+03 4
## 31 Chi0 2701 423 0.00 4.154000e+01 8.606549e+00 3.510255e+00 4.154000e+01 1
## 32 Chi0n 2701 887 0.00 3.864000e+01 6.661999e+00 3.077225e+00 3.864000e+01 1
## 33 Chi0v 2701 932 0.00 3.864000e+01 7.191896e+00 3.108888e+00 3.864000e+01 1
## 34 Chi1 2701 525 0.00 2.803000e+01 5.498308e+00 2.490393e+00 2.803000e+01 1
## 35 Chi1n 2701 688 0.00 2.554000e+01 3.717397e+00 2.021705e+00 2.554000e+01 1
## 36 Chi1v 2701 728 0.00 2.618000e+01 4.104180e+00 2.156157e+00 2.618000e+01 1
## 37 Chi2n 2701 588 0.00 1.746000e+01 2.612525e+00 1.523367e+00 1.746000e+01 10
## 38 Chi2v 2701 643 0.00 2.662000e+01 3.058541e+00 1.897245e+00 2.662000e+01 10
## 39 Chi3n 2701 452 0.00 1.161000e+01 1.630563e+00 1.120048e+00 1.161000e+01 73
## 40 Chi3v 2701 509 0.00 2.833000e+01 1.958597e+00 1.589385e+00 2.833000e+01 73
## 41 Chi4n 2701 344 0.00 7.760000e+00 1.014558e+00 8.281185e-01 7.760000e+00 193
## 42 Chi4v 2701 388 0.00 3.150000e+01 1.244017e+00 1.424883e+00 3.150000e+01 193
## 43 HallKierAlpha 2701 346 -5.53 3.040000e+00 -9.939689e-01 8.631283e-01 8.570000e+00 102
## 44 Ipc 2701 904 0.00 1.270636e+13 5.169880e+09 2.456623e+11 1.270636e+13 1
## 45 Kappa1 2701 922 0.00 5.541000e+01 8.614528e+00 3.981716e+00 5.541000e+01 1
## 46 Kappa2 2701 714 0.00 4.709000e+01 4.077205e+00 3.387920e+00 4.709000e+01 2
## 47 Kappa3 2701 619 -27.04 9.507960e+03 1.226420e+01 2.257735e+02 9.535000e+03 6
## 48 LabuteASA 2701 1696 8.74 3.548100e+02 7.174526e+01 2.977018e+01 3.460700e+02 0
## 49 PEOE_VSA1 2701 115 0.00 3.041000e+01 4.550944e+00 4.748006e+00 3.041000e+01 1105
## 50 PEOE_VSA10 2701 147 -0.06 3.448000e+01 2.539941e+00 4.384323e+00 3.454000e+01 1843
## 51 PEOE_VSA11 2701 77 0.00 4.654000e+01 1.451466e+00 3.938004e+00 4.654000e+01 2260
## 52 PEOE_VSA12 2701 64 0.00 3.490000e+01 9.226435e-01 2.918698e+00 3.490000e+01 2382
## 53 PEOE_VSA13 2701 44 0.00 2.024000e+01 8.589448e-01 2.543814e+00 2.024000e+01 2381
## 54 PEOE_VSA14 2701 66 0.00 4.789000e+01 1.968256e+00 3.904804e+00 4.789000e+01 1996
## 55 PEOE_VSA2 2701 74 0.00 3.959000e+01 3.004198e+00 4.650954e+00 3.959000e+01 1657
## 56 PEOE_VSA3 2701 68 0.00 4.390000e+01 1.814061e+00 3.994742e+00 4.390000e+01 2034
## 57 PEOE_VSA4 2701 40 0.00 7.903000e+01 1.023265e+00 3.831925e+00 7.903000e+01 2412
## 58 PEOE_VSA5 2701 41 0.00 4.640000e+01 9.953536e-01 3.751215e+00 4.640000e+01 2478
## 59 PEOE_VSA6 2701 375 0.00 2.835200e+02 2.313013e+01 2.661100e+01 2.835200e+02 670
## 60 PEOE_VSA7 2701 380 0.00 1.593000e+02 1.704308e+01 1.366446e+01 1.593000e+02 381
## 61 PEOE_VSA8 2701 342 0.00 6.533000e+01 6.731229e+00 7.461736e+00 6.533000e+01 1040
## 62 PEOE_VSA9 2701 257 0.00 5.286000e+01 5.006709e+00 6.751778e+00 5.286000e+01 1397
## 63 SMR_VSA1 2701 182 0.00 7.903000e+01 6.868097e+00 7.140983e+00 7.903000e+01 925
## 64 SMR_VSA10 2701 387 0.00 1.593000e+02 1.233369e+01 1.282037e+01 1.593000e+02 679
## 65 SMR_VSA3 2701 59 0.00 2.582000e+01 1.378767e+00 3.415393e+00 2.582000e+01 2219
## 66 SMR_VSA4 2701 33 0.00 3.173000e+01 9.714291e-01 2.769780e+00 3.173000e+01 2338
## 67 SMR_VSA5 2701 280 -0.06 2.965500e+02 1.471167e+01 2.530551e+01 2.966100e+02 1054
## 68 SMR_VSA6 2701 152 0.00 5.732000e+01 4.180300e+00 7.372412e+00 5.732000e+01 1691
## 69 SMR_VSA7 2701 446 0.00 2.426500e+02 2.770715e+01 2.301097e+01 2.426500e+02 725
## 70 SMR_VSA9 2701 56 0.00 4.556000e+01 2.626797e+00 5.138712e+00 4.556000e+01 1969
## 71 SlogP_VSA1 2701 90 0.00 4.149000e+01 2.356649e+00 4.160890e+00 4.149000e+01 1874
## 72 SlogP_VSA10 2701 48 0.00 7.903000e+01 2.285054e+00 5.064532e+00 7.903000e+01 2012
## 73 SlogP_VSA11 2701 9 0.00 2.300000e+01 1.565046e+00 3.706333e+00 2.300000e+01 2202
## 74 SlogP_VSA12 2701 80 0.00 1.593000e+02 6.197627e+00 1.210005e+01 1.593000e+02 1900
## 75 SlogP_VSA2 2701 542 -0.06 9.962000e+01 1.032320e+01 9.912812e+00 9.968000e+01 613
## 76 SlogP_VSA3 2701 116 0.00 3.228000e+01 3.197146e+00 4.984146e+00 3.228000e+01 1687
## 77 SlogP_VSA4 2701 80 0.00 5.817000e+01 3.714136e+00 6.570836e+00 5.817000e+01 1822
## 78 SlogP_VSA5 2701 258 0.00 2.904500e+02 1.661442e+01 2.480933e+01 2.904500e+02 759
## 79 SlogP_VSA6 2701 245 0.00 2.426500e+02 2.191129e+01 2.053384e+01 2.426500e+02 755
## 80 SlogP_VSA7 2701 26 0.00 5.238000e+01 1.021625e+00 3.725404e+00 5.238000e+01 2395
## 81 SlogP_VSA8 2701 64 0.00 6.463000e+01 1.855302e+00 5.583768e+00 6.463000e+01 2344
## 82 TPSA 2701 357 0.00 2.058400e+02 3.224628e+01 2.661784e+01 2.058400e+02 574
## 83 EState_VSA1 2701 254 0.00 6.930000e+01 3.877297e+00 7.107046e+00 6.930000e+01 1738
## 84 EState_VSA10 2701 75 0.00 7.903000e+01 4.689278e+00 6.195649e+00 7.903000e+01 1249
## 85 EState_VSA2 2701 255 0.00 6.135000e+01 4.412133e+00 6.151865e+00 6.135000e+01 1456
## 86 EState_VSA3 2701 263 0.00 7.585000e+01 5.004228e+00 6.897207e+00 7.585000e+01 1357
## 87 EState_VSA4 2701 329 -0.06 7.304000e+01 6.781866e+00 8.798462e+00 7.310000e+01 1175
## 88 EState_VSA5 2701 266 0.00 2.696700e+02 1.160126e+01 2.098993e+01 2.696700e+02 1186
## 89 EState_VSA6 2701 129 0.00 7.280000e+01 7.177453e+00 1.052613e+01 7.280000e+01 1531
## 90 EState_VSA7 2701 138 0.00 9.878000e+01 9.438038e+00 1.508731e+01 9.878000e+01 1560
## 91 EState_VSA8 2701 353 0.00 1.941200e+02 1.143315e+01 1.778714e+01 1.941200e+02 1164
## 92 EState_VSA9 2701 143 0.00 1.160100e+02 6.625409e+00 9.298870e+00 1.160100e+02 1143
## 93 VSA_EState1 2701 693 -2.17 2.206400e+02 4.352107e+00 1.165540e+01 2.228100e+02 1661
## 94 VSA_EState10 2701 484 -0.82 6.086000e+01 2.017930e+00 4.383032e+00 6.168000e+01 1930
## 95 VSA_EState2 2701 857 -1.90 5.838000e+01 6.977001e+00 8.001439e+00 6.028000e+01 1148
## 96 VSA_EState3 2701 897 -3.66 4.021000e+01 5.022847e+00 6.458722e+00 4.387000e+01 1120
## 97 VSA_EState4 2701 714 -6.44 2.046000e+01 1.405935e+00 2.453432e+00 2.690000e+01 950
## 98 VSA_EState5 2701 472 -52.81 1.046000e+01 -1.038467e-01 2.111076e+00 6.327000e+01 1182
## 99 VSA_EState6 2701 1081 -1.37 8.417000e+01 5.867523e+00 6.661747e+00 8.554000e+01 875
## 100 VSA_EState7 2701 726 -16.43 6.238000e+01 1.765391e+00 5.014952e+00 7.881000e+01 1501
## 101 VSA_EState8 2701 655 -4.84 2.316000e+01 1.798967e+00 2.748039e+00 2.800000e+01 1328
## 102 VSA_EState9 2701 272 -15.55 8.260000e+00 1.360459e-01 1.211063e+00 2.381000e+01 2281
## 103 FractionCSP3 2701 77 0.00 1.000000e+00 3.424361e-01 3.665076e-01 1.000000e+00 803
## 104 HeavyAtomCount 2701 42 1.00 5.700000e+01 1.153906e+01 5.065904e+00 5.600000e+01 0
## 105 NHOHCount 2701 9 0.00 8.000000e+00 7.571270e-01 1.014100e+00 8.000000e+00 1464
## 106 NOCount 2701 13 0.00 1.400000e+01 2.041096e+00 1.743606e+00 1.400000e+01 574
## 107 NumAliphaticCarbocycles 2701 5 0.00 4.000000e+00 6.442058e-02 2.740579e-01 4.000000e+00 2540
## 108 NumAliphaticHeterocycles 2701 4 0.00 3.000000e+00 7.034432e-02 2.779785e-01 3.000000e+00 2526
## 109 NumAliphaticRings 2701 5 0.00 4.000000e+00 1.347649e-01 3.853549e-01 4.000000e+00 2371
## 110 NumAromaticCarbocycles 2701 8 0.00 8.000000e+00 8.130322e-01 8.179879e-01 8.000000e+00 1046
## 111 NumAromaticHeterocycles 2701 4 0.00 3.000000e+00 1.469826e-01 3.880855e-01 3.000000e+00 2335
## [ reached 'max' / getOption("max.print") -- omitted 40 rows ]
# Prepare data for plotting 'Distinct' values vs 'target_var'
column_stats_distinct <- column_stats[, c("Column", "Distinct")]
# Take as many leading target values as there are statistic rows.
# seq_len() (rather than 1:n) stays empty when the statistics table is empty.
# NOTE(review): this pairs the i-th descriptor's statistic with the i-th
# observation's target value -- confirm that pairing is intended.
column_stats_distinct$target_var <-
  descriptor_data_cleaned$target_var[seq_len(nrow(column_stats_distinct))]
# Apply log transformation to Distinct to enhance visibility of small changes
column_stats_distinct$Distinct <- log1p(column_stats_distinct$Distinct)
# Normalize the values separately
# Min-max rescale a numeric vector onto [0, 1].
#   x: numeric vector; NAs are ignored when finding the range and stay NA
#      in the result.
# Returns a vector the same length as x. When all non-missing values are
# equal the result is 0 for each of them (instead of NaN from 0/0). An empty
# or all-NA vector is returned unchanged.
normalize <- function(x) {
  if (length(x) == 0L || all(is.na(x))) {
    return(x)
  }
  lower <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lower
  # isTRUE() guards against a non-finite span (e.g. Inf - Inf) in the test
  if (isTRUE(span == 0)) {
    return(x - lower)
  }
  (x - lower) / span
}
# Rescale each series independently so both share the same y-axis
column_stats_distinct$Distinct <- normalize(column_stats_distinct$Distinct)
column_stats_distinct$target_var <- normalize(column_stats_distinct$target_var)
# Reshape data for plotting: long format keyed by Column
melted_stats <- reshape2::melt(column_stats_distinct, id.vars = "Column")
# Build the line/point chart.
# theme_minimal() is a complete theme and replaces earlier theme() settings,
# so it is added first and the x-axis label rotation is applied afterwards.
p <- ggplot(melted_stats, aes(x = Column, y = value, color = variable, group = variable)) +
  geom_line() +
  geom_point() +
  labs(title = "Distinct Values and 'target_var' Comparison (Log Transformed & Normalized)", x = "Descriptors", y = "Normalized Values", color = "Metric") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert ggplot to Plotly plot
p_plotly <- ggplotly(p)
p_plotly
Visualize column statistics average vs ‘target_var’
# Prepare data for plotting average vs 'target_var'
column_stats_avg <- column_stats[, c("Column", "Average")]
# Take as many leading target values as there are statistic rows.
# seq_len() (rather than 1:n) stays empty when the statistics table is empty.
column_stats_avg$target_var <-
  descriptor_data_cleaned$target_var[seq_len(nrow(column_stats_avg))]
# Apply a sign-preserving log transformation to Average to enhance visibility
# of small changes. Plain log1p() returns NaN for averages below -1 (several
# descriptors have negative means), which silently dropped those points.
column_stats_avg$Average <-
  sign(column_stats_avg$Average) * log1p(abs(column_stats_avg$Average))
## Warning in log1p(column_stats_avg$Average): NaNs produced
# Normalize the values separately so both series share the same y-axis
column_stats_avg$Average <- normalize(column_stats_avg$Average)
column_stats_avg$target_var <- normalize(column_stats_avg$target_var)
# Reshape data for plotting average: long format keyed by Column
melted_avg <- reshape2::melt(column_stats_avg, id.vars = "Column")
# Build the line/point chart.
# theme_minimal() is a complete theme and replaces earlier theme() settings,
# so it is added first and the x-axis label rotation is applied afterwards.
p_avg <- ggplot(melted_avg, aes(x = Column, y = value, color = variable, group = variable)) +
  geom_line() +
  geom_point() +
  labs(title = "Average and 'target_var' Comparison (Log Transformed & Normalized)", x = "Descriptors", y = "Normalized Values", color = "Metric") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert ggplot to Plotly plot
p_avg_plotly <- ggplotly(p_avg)
p_avg_plotly
Visualize column statistics Standard Deviation vs ‘target_var’
# Prepare data for plotting standard deviation vs 'target_var'
column_stats_std <- column_stats[, c("Column", "Std_Dev")]
# Take as many leading target values as there are statistic rows.
# seq_len() (rather than 1:n) stays empty when the statistics table is empty.
column_stats_std$target_var <-
  descriptor_data_cleaned$target_var[seq_len(nrow(column_stats_std))]
# Apply log transformation to Std_Dev to enhance visibility of small changes
column_stats_std$Std_Dev <- log1p(column_stats_std$Std_Dev)
# Normalize the values separately so both series share the same y-axis
column_stats_std$Std_Dev <- normalize(column_stats_std$Std_Dev)
column_stats_std$target_var <- normalize(column_stats_std$target_var)
# Reshape data for plotting standard deviation: long format keyed by Column
melted_std <- reshape2::melt(column_stats_std, id.vars = "Column")
# Build the line/point chart.
# theme_minimal() is a complete theme and replaces earlier theme() settings,
# so it is added first and the x-axis label rotation is applied afterwards.
p_std <- ggplot(melted_std, aes(x = Column, y = value, color = variable, group = variable)) +
  geom_line() +
  geom_point() +
  labs(title = "Standard Deviation and 'target_var' Comparison (Log Transformed & Normalized)", x = "Descriptors", y = "Normalized Values", color = "Metric") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert ggplot to Plotly plot
p_std_plotly <- ggplotly(p_std)
p_std_plotly